# Source code: defines functions to run bootstraps and extract relevant parameters #
# Note that you will need to load/install the same set of packages to run the source code (e.g., dplyr) #

# Functions

#perform_bootstrap     -> runs bootstraps and sets seed
#extract_short         -> does the exact same thing, but does not save results from OLS models
#prop                  -> calculate proportion of respondents within a categorical variable, used for plotting
#format_fixed_decimals -> force decimal spaces when merging different rows/columns to 2 or 3, used for tabular output
#make_Tab              -> select relevant columns to make a table, used for tabular output

# Function to perform bootstrapping for a given data frame
# Note: generation defaults to genPew but can be specified manually when calling the function
perform_bootstrap <- function(data_frame, n_bootstraps, include_controls = TRUE, gen = "genPew") {
  # Set list by n bootstraps
  boot_results <- vector("list", n_bootstraps)
  # Repeat the seed 
  # This is necessary, since the seed determines which sample will be drawn and has to be reset every time in the loop
  set.seed(28)
  # Loop to generate bootstraps and save the results
  for (i in 1:n_bootstraps) {
    
    model_results <- list()
    
    # Keep track of progress, the bootstraps will take a considerable amount of time
    cat(paste("Bootstrap Iteration:", i, "out of", n_bootstraps, "\n"))
    flush.console()
    
    # Generate bootstrap sample
    boot_sample <- data_frame[sample(nrow(data_frame), replace = TRUE), ]
    
    # Define model formula based on whether to include controls
    formula_mean <- if (include_controls) {
      formula(paste("att ~", gen, "+ year + ageGroup + protcattend + white + college", sep = " "), data = boot_sample)
    } else {
      formula(paste("att ~", gen, "+ year + ageGroup", sep = " "), data = boot_sample)
    }
    
    # Party identification (means)
    formula_pid <- if (include_controls) {
      formula(paste("partyid ~", gen, "+ year + ageGroup + protcattend + white + college", sep = " "), data = boot_sample)
    } else {
      formula(paste("partyid ~", gen, "+ year + ageGroup", sep = " "), data = boot_sample)
    }
    
    # Divergence
    formula_div <- if (include_controls) {
      formula(paste("div ~", gen, "+ year + ageGroup + protcattend + white + college", sep = " "), data = boot_sample)
    } else {
      formula(paste("div ~", gen, "+ year + ageGroup", sep = " "), data = boot_sample)
    }
    
    # Divergence PID
    formula_pid_div <- if (include_controls) {
      formula(paste("div_pid ~", gen, "+ year + ageGroup + protcattend + white + college", sep = " "), data = boot_sample)
    } else {
      formula(paste("div_pid ~", gen, "+ year + ageGroup", sep = " "), data = boot_sample)
    }
    
    # Estimate mean-levels  
    est_mean  <- lm(formula_mean, data = boot_sample)
    pid_mean  <- lm(formula_pid, data = boot_sample)
    
    boot_sample$div      <- abs(est_mean$residuals)
    boot_sample$div_pid  <- abs(pid_mean$residuals)
    
    # Compute divergence values based on whether the flag is set for control variables
    if (include_controls) {
      est_div <- lm(formula_div, data = boot_sample)
      pid_div <- lm(formula_pid_div, data = boot_sample)
      div         <- abs(est_div$residuals)
      div_pid     <- abs(pid_div$residuals)
    } else {
      div         <- abs(lm(formula_div, data = boot_sample)$residuals)
      div_pid     <- abs(lm(formula_pid_div, data = boot_sample)$residuals)
    }
    
    # Estimate divergence
    est_div <- lm(formula_div, data = boot_sample)
    pid_div <- lm(formula_pid_div, data = boot_sample)
    
    # We do the same to calculate sorting values
    if (include_controls) {
      fitted_values_div     <- est_div$fitted.values
      fitted_values_pid_div <- pid_div$fitted.values
    } else {
      fitted_values_div     <- lm(formula_div, data = boot_sample)$fitted.values
      fitted_values_pid_div <- lm(formula_pid_div, data = boot_sample)$fitted.values
    }
    
    # Compute sorting values
    boot_sample$sort_dv <- (boot_sample$att - est_mean$fitted.values) / est_div$fitted.values
    boot_sample$sort_iv <- (boot_sample$partyid  - pid_mean$fitted.values) / pid_div$fitted.values
    
    # Estimate sorting
    formula_sort <- if (include_controls) {
      formula(paste("sort_dv ~", gen, " * sort_iv + year * sort_iv + ageGroup * sort_iv + protcattend * sort_iv + white * sort_iv + college * sort_iv", sep = " "), data = boot_sample)
    } else {
      formula(paste("sort_dv ~", gen, " * sort_iv + year * sort_iv + ageGroup * sort_iv", sep = " "), data = boot_sample)
    }
    
    # Run model
    est_sort <- lm(formula_sort, data = boot_sample)
    
    # Save coefficients and number of bootstraps
    boot_results[[i]] <- list(coef_Mean    = coef(est_mean), 
                              coef_Div     = coef(est_div), 
                              coef_Sort    = coef(est_sort),
                              R2_Mean      = summary(est_mean)$r.squared,
                              R2_Div       = summary(est_div)$r.squared,
                              R2_Sort      = summary(est_sort)$r.squared,
                              adj.R2_Mean  = summary(est_mean)$adj.r.squared,
                              adj.R2_Div   = summary(est_div)$adj.r.squared,
                              adj.R2_Sort  = summary(est_sort)$adj.r.squared,
                              R            = paste(n_bootstraps),
                              nobs         = paste(nobs(est_mean)),
                              controls     = include_controls)
  }
  
  # Return
  return(boot_results)
}

# Define a function to process bootstrapped results
extract <- function(boot_results, data_frame) {
  
  # Extract coefficients for mean, div, and sort
  coef_mean  <- sapply(boot_results, function(estimate) estimate$coef_Mean)
  coef_div   <- sapply(boot_results, function(estimate) estimate$coef_Div)
  coef_sort  <- sapply(boot_results, function(estimate) estimate$coef_Sort)
  
  # Calculate standard deviations
  se_mean  <- apply(coef_mean, 1, sd)
  se_div   <- apply(coef_div, 1, sd)
  se_sort  <- apply(coef_sort, 1, sd)
  
  # Calculate confidence intervals
  CIboot_lower_mean  <- apply(coef_mean, 1, quantile, probs = 0.025)
  CIboot_upper_mean  <- apply(coef_mean, 1, quantile, probs = 0.975)
  
  CIboot_lower_div   <- apply(coef_div, 1, quantile, probs = 0.025)
  CIboot_upper_div   <- apply(coef_div, 1, quantile, probs = 0.975)
  
  CIboot_lower_sort <- apply(coef_sort, 1, quantile, probs = 0.025)
  CIboot_upper_sort <- apply(coef_sort, 1, quantile, probs = 0.975)
  
  # Calculate means of the bootstrapped coefficients
  coef_mean  <- apply(coef_mean, 1, mean)
  coef_div   <- apply(coef_div, 1, mean)
  coef_sort <- apply(coef_sort, 1, mean)
  
  # Extract R2 from the bootstraps
  R2_M <- sapply(boot_results, function(estimate) estimate$R2_Mean)
  R2_D <- sapply(boot_results, function(estimate) estimate$R2_Div)
  R2_A <- sapply(boot_results, function(estimate) estimate$R2_Sort)
  
  # Same for Adjusted R2 (we don't report SEs for adjusted R2)
  adj_R2_Mean  <- sapply(boot_results, function(estimate) estimate$adj.R2_Mean)
  adj_R2_Div   <- sapply(boot_results, function(estimate) estimate$adj.R2_Div)
  adj_R2_Sort  <- sapply(boot_results, function(estimate) estimate$adj.R2_Sort)
  
  # Compute means
  R2_Mean  <- mean(R2_M)
  R2_Div   <- mean(R2_D)
  R2_Sort  <- mean(R2_A)
  
  # Same for adj R2
  adj_R2_Mean  <- mean(adj_R2_Mean)
  adj_R2_Div   <- mean(adj_R2_Div)
  adj_R2_Sort  <- mean(adj_R2_Sort)
  
  # Compute standard deviation of R2
  R2_dev_Mean  <- sd(R2_M)
  R2_dev_Div   <- sd(R2_D)
  R2_dev_Sort  <- sd(R2_A)
  
  # Extract number of observations (these are identical across model specifications of mean, divergence, and sorting)
  N <- length(data_frame)
  
  # Create data frames to store the results
  param_Mean <- data.frame(
    coef       = coef_mean,                                                     #coefficients
    se         = se_mean,                                                       #standard errors
    p          = 2 * pt(-abs(coef_mean/se_mean), df = nrow(data_frame) - 2),    #p-values
    CI_lower   = CIboot_lower_mean,                                             #lower bound confidence intervals
    CI_upper   = CIboot_upper_mean,                                             #upper bound confidence intervals
    r2         = R2_Mean,                                                       #r2
    r2_dev     = R2_dev_Mean,
    adj_r2     = adj_R2_Mean,                                                   #adjusted r2
    N          = N,                                                             #number of observations
    model      = "Boot Mean");                                                  #model indicator
  param_Mean <- tibble::rownames_to_column(param_Mean, var = "term")            #predictor id
  
  param_Div <- data.frame(
    coef     = coef_div,
    se       = se_div,
    p        = 2 * pt(-abs(coef_div/se_div), df = nrow(data_frame) - 2),
    CI_lower = CIboot_lower_div,
    CI_upper = CIboot_upper_div,
    r2       = R2_Div,
    r2_dev   = R2_dev_Div,
    adj_r2   = adj_R2_Div,
    N        = N,
    model    = "Boot Divergence");
  param_Div <- tibble::rownames_to_column(param_Div, var = "term")
  
  param_Sort <- data.frame(
    coef     = coef_sort,
    se       = se_sort,
    p        = 2 * pt(-abs(coef_sort/se_sort), df = nrow(data_frame) - 2),
    CI_lower = CIboot_lower_sort,
    CI_upper = CIboot_upper_sort,
    r2       = R2_Sort,
    r2_dev   = R2_dev_Sort,
    adj_r2   = adj_R2_Sort,
    N        = N,
    model    = "Boot Sorting");
  param_Sort <- tibble::rownames_to_column(param_Sort, var = "term")
  
  # Merge and clean up data frames
  param <- rbind.data.frame(param_Mean, param_Div, param_Sort)
  
  param <- param %>%
    mutate(term = case_when(
      term == "sort_iv" ~ "Variable",
      term == "protcattend" ~ "Protestant",
      term == "white" ~ "White",
      term == "college" ~ "College",
      term == "sort_iv:protcattend" ~ ":Protestant",
      term == "sort_iv:white" ~ ":White",
      term == "sort_iv:college" ~ ":College",
      term == "(Intercept)" ~ "Intercept",
      TRUE ~ term
    ))
  # Clean up term
  param$term <- gsub("sort_iv", "", param$term)
  param$term <- gsub("genPew", "Cohort ", param$term)
  param$term <- gsub("genDec", "Cohort ", param$term)
  param$term <- gsub("year", "Year ", param$term)
  param$term <- gsub("ageGroup", "Age ", param$term)
  param$term <- ifelse(grepl(":", param$term), 
                       paste0(":", sub("^(.*):$", "\\1", param$term)), 
                       param$term)
  param$term <- gsub(":{2,}", ":", param$term)
  
  # Save object
  return(param)
}

# A little cumbersome, but this prevents running the robustness models both as boot and OLS while keeping the extract function somewhat readable
extract_short <- function(boot_results, data_frame) {
  
  # Extract coefficients for mean, div, and sort
  coef_mean  <- sapply(boot_results, function(estimate) estimate$coef_Mean)
  coef_div   <- sapply(boot_results, function(estimate) estimate$coef_Div)
  coef_sort  <- sapply(boot_results, function(estimate) estimate$coef_Sort)
  
  # Calculate standard deviations
  se_mean  <- apply(coef_mean, 1, sd)
  se_div   <- apply(coef_div, 1, sd)
  se_sort  <- apply(coef_sort, 1, sd)
  
  # Calculate confidence intervals
  CIboot_lower_mean  <- apply(coef_mean, 1, quantile, probs = 0.025)
  CIboot_upper_mean  <- apply(coef_mean, 1, quantile, probs = 0.975)
  
  CIboot_lower_div   <- apply(coef_div, 1, quantile, probs = 0.025)
  CIboot_upper_div   <- apply(coef_div, 1, quantile, probs = 0.975)
  
  CIboot_lower_sort <- apply(coef_sort, 1, quantile, probs = 0.025)
  CIboot_upper_sort <- apply(coef_sort, 1, quantile, probs = 0.975)
  
  # Calculate means of the bootstrapped coefficients
  coef_mean  <- apply(coef_mean, 1, mean)
  coef_div   <- apply(coef_div, 1, mean)
  coef_sort <- apply(coef_sort, 1, mean)
  
  # Extract R2 from the bootstraps
  R2_M <- sapply(boot_results, function(estimate) estimate$R2_Mean)
  R2_D <- sapply(boot_results, function(estimate) estimate$R2_Div)
  R2_A <- sapply(boot_results, function(estimate) estimate$R2_Sort)
  
  # Same for Adjusted R2
  adj_R2_Mean  <- sapply(boot_results, function(estimate) estimate$adj.R2_Mean)
  adj_R2_Div   <- sapply(boot_results, function(estimate) estimate$adj.R2_Div)
  adj_R2_Sort <- sapply(boot_results, function(estimate) estimate$adj.R2_Sort)
  
  # Compute means
  R2_Mean  <- mean(R2_M)
  R2_Div   <- mean(R2_D)
  R2_Sort <- mean(R2_A)
  
  # Adj R2
  adj_R2_Mean  <- mean(adj_R2_Mean)
  adj_R2_Div   <- mean(adj_R2_Div)
  adj_R2_Sort <- mean(adj_R2_Sort)
  
  # Compute standard deviation of R2
  R2_dev_Mean  <- sd(R2_M)
  R2_dev_Div   <- sd(R2_D)
  R2_dev_Sort <- sd(R2_A)
  
  # Extract number of observations (these are identical across model specifications of mean, divergence, and sorting)
  N <- sapply(boot_results, function(estimate) estimate$nobs)
  N <- unique(N)
  
  # Create data frames to store the results
  param_Mean <- data.frame(
    term       = rownames(data.frame(coef_mean)),                             
    coef       = coef_mean,                                                     #coefficients
    se         = se_mean,                                                       #standard errors
    p          = 2 * pt(-abs(coef_mean/se_mean), df = nrow(data_frame) - 2),    #p-values
    CI_lower   = CIboot_lower_mean,                                             #lower bound confidence intervals
    CI_upper   = CIboot_upper_mean,                                             #upper bound confidence intervals
    r2         = rep(R2_Mean, length(coef_mean)),                               #r2
    r2_dev     = rep(R2_dev_Mean, length(coef_mean)),
    adj_r2     = rep(adj_R2_Mean, length(coef_mean)),                           #adjusted r2
    N          = rep(N, length(coef_mean)),                                     #number of observations                                                   
    model      = "Boot Mean")                                                   #model indicator
  
  param_Div <- data.frame(
    term     = rownames(data.frame(coef_div)),
    coef     = coef_div,
    se       = se_div,
    p        = 2 * pt(-abs(coef_div/se_div), df = nrow(data_frame) - 2),
    CI_lower = CIboot_lower_div,
    CI_upper = CIboot_upper_div,
    r2       = rep(R2_Div, length(coef_div)),
    r2_dev   = rep(R2_dev_Div, length(coef_div)),
    adj_r2   = rep(adj_R2_Div, length(coef_div)),
    N        = rep(N, length(coef_div)),
    model    = "Boot Divergence")
  
  param_Sort <- data.frame(
    term     = rownames(data.frame(coef_sort)),
    coef     = coef_sort,
    se       = se_sort,
    p        = 2 * pt(-abs(coef_sort/se_sort), df = nrow(data_frame) - 2),
    CI_lower = CIboot_lower_sort,
    CI_upper = CIboot_upper_sort,
    r2       = rep(R2_Sort, length(coef_sort)),
    r2_dev   = rep(R2_dev_Sort, length(coef_sort)),
    adj_r2   = rep(adj_R2_Sort, length(coef_sort)),
    N        = rep(N, length(coef_sort)),
    model    = "Boot Sorting")
  
  # Merge and clean up data frames
  param <- rbind.data.frame(param_Mean, param_Div, param_Sort)
  
  param <- param %>%
    mutate(term = case_when(
      term == "sort_iv" ~ "Variable",
      term == "protcattend" ~ "Protestant",
      term == "white" ~ "White",
      term == "college" ~ "College",
      term == "sort_iv:protcattend" ~ ":Protestant",
      term == "sort_iv:white" ~ ":White",
      term == "sort_iv:college" ~ ":College",
      term == "(Intercept)" ~ "Intercept",
      TRUE ~ term
    ))
  # Clean up term
  param$term <- gsub("sort_iv", "", param$term)
  param$term <- gsub("genPew", "Cohort ", param$term)
  param$term <- gsub("genDec", "Cohort ", param$term)
  param$term <- gsub("year", "Year ", param$term)
  param$term <- gsub("ageGroup", "Age ", param$term)
  param$term <- ifelse(grepl(":", param$term), 
                       paste0(":", sub("^(.*):$", "\\1", param$term)), 
                       param$term)
  param$term <- gsub(":{2,}", ":", param$term)
  
  return(param)
}

# Compute proportion respondents by category (note: only run for visuals, i.e., main models)
prop <- function(data_frame, var_value, gen) {
  g_prop <- data.frame(prop.table(table(data_frame[[gen]])))
  y_prop <- data.frame(prop.table(table(data_frame$year)))
  a_prop <- data.frame(prop.table(table(data_frame$ageGroup)))
  
  c_prop <- data.frame(prop.table(table(data_frame$college)))
  p_prop <- data.frame(prop.table(table(data_frame$protcattend)))
  w_prop <- data.frame(prop.table(table(data_frame$white)))
  
  c_prop <- c_prop[-1, ]
  p_prop <- p_prop[-1, ]
  w_prop <- w_prop[-1, ]
  
  c_prop$Var1 <- c("College")
  p_prop$Var1 <- c("Protestant")
  w_prop$Var1 <- c("White")
  
  prop_df <- plyr::rbind.fill(g_prop, y_prop, a_prop, 
                              c_prop, p_prop, w_prop)
  prop_df <- rbind(prop_df, transform(prop_df, Var1 = paste0(":", Var1)))
  
  # Add the 'var' column with the specified value
  prop_df$var  <- var_value
  prop_df$term <- as.character(prop_df$Var1)
  prop_df$Var1 <- NULL
  
  return(prop_df)
}

# Force decimals to a specific number, used for tabular output
format_fixed_decimals <- function(x, decimals = 2) {
  sprintf(paste0("%.", decimals, "f"), x)
}

# Another function for tabular output
make_Tab <- function(data_frame) {
  data_frame %>%
    select(term, coef, se, CI_lower, CI_upper, p, r2, r2_dev, adj_r2, N, model)
}

# Function to clean data for tabular output
clean_Tab <- function(data_frame) {
  data_frame %>%
    mutate(term = gsub(":_C", "", term),
           term = case_when(
             term == "_C" ~ "Variable",
             term == ":white" ~ ":White",
             term == ":college" ~ ":College", 
             term == ":protcattend" ~ ":Protestant",
             TRUE ~ term
           )) %>%
    mutate(group = case_when(
      startsWith(term, "Cohort") ~ "Cohort",
      startsWith(term, "Age") ~ "Age",
      startsWith(term, "Year") ~ "Year",
      startsWith(term, ":Cohort") ~ "Cohort",
      startsWith(term, ":Age") ~ "Age",
      startsWith(term, ":Year") ~ "Year",
      startsWith(term, ":_C:Cohort") ~ "Cohort",
      startsWith(term, ":_C:Age") ~ "Age",
      startsWith(term, ":_C:Year") ~ "Year",
      startsWith(term, "White") ~ "White",
      startsWith(term, "College") ~ "College",
      startsWith(term, "Year") ~ "Protestant",
      startsWith(term, ":White") ~ "White",
      startsWith(term, ":College") ~ "College",
      startsWith(term, ":Protestant") ~ "Protestant",
      startsWith(term, "genSub") ~ "Cohort",
      startsWith(term, "genAdd") ~ "Cohort",
      startsWith(term, "genSub") ~ ":Cohort",
      startsWith(term, "genAdd") ~ ":Cohort",
      TRUE ~ term
    )) %>%
    mutate(boot = case_when(
      startsWith(model, "Boot") ~ "Boot",
      TRUE ~ "OLS"
    )) %>%
    mutate(term = gsub("Cohort |Age |Year ", "", term),
           facet = gsub("Boot |Original ", "", model)) 
}
